// Copyright 2021 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "textflag.h"

// func crc32Int64BatchHash(data *uint64, hashes *uint64, length int)
// Requires: SSE4.2
//
// For each of the length 64-bit keys starting at data, one 64-bit word is
// written to hashes:
//   - the upper 32 bits are the upper 32 bits of the key (the key is first
//     copied verbatim, then only its low half is overwritten);
//   - the lower 32 bits are the CRC32Q accumulator after folding the key into
//     an all-ones initial value. No final inversion is applied.
//
// A non-positive length returns without touching memory.
//
// Only SSE2 moves (MOVOU) and the SSE4.2 CRC32 instruction are used, so the
// routine needs nothing beyond what the "Requires" line states and leaves no
// dirty upper-YMM state behind.
//
// Registers: SI = read cursor, DI = write cursor, CX = keys remaining,
// R8-R15 = CRC accumulators, X0-X3 = copy scratch.
TEXT ·crc32Int64BatchHash(SB), NOSPLIT, $0-24
	MOVQ data+0(FP), SI
	MOVQ hashes+8(FP), DI
	MOVQ length+16(FP), CX

	// Nothing to do for an empty (or nonsensical negative) batch.
	TESTQ CX, CX
	JLE   done

loop:
	// The unrolled body consumes eight keys; fewer than eight go to the tail.
	SUBQ $8, CX
	JL   tail

	// Copy the 64 bytes of keys to the output so that the high half of every
	// output word already holds the high half of its key.
	MOVOU 0x00(SI), X0
	MOVOU 0x10(SI), X1
	MOVOU X0, 0x00(DI)
	MOVOU X1, 0x10(DI)
	MOVOU 0x20(SI), X2
	MOVOU 0x30(SI), X3
	MOVOU X2, 0x20(DI)
	MOVOU X3, 0x30(DI)

	// Eight independent accumulators, each seeded with all ones, so the eight
	// CRC32Q instructions below do not depend on one another.
	MOVQ $-1, R8
	MOVQ $-1, R9
	MOVQ $-1, R10
	MOVQ $-1, R11
	MOVQ $-1, R12
	MOVQ $-1, R13
	MOVQ $-1, R14
	MOVQ $-1, R15

	CRC32Q 0x00(SI), R8
	CRC32Q 0x08(SI), R9
	CRC32Q 0x10(SI), R10
	CRC32Q 0x18(SI), R11
	CRC32Q 0x20(SI), R12
	CRC32Q 0x28(SI), R13
	CRC32Q 0x30(SI), R14
	CRC32Q 0x38(SI), R15

	// 32-bit stores: replace only the low half of each output word.
	MOVL R8, 0x00(DI)
	MOVL R9, 0x08(DI)
	MOVL R10, 0x10(DI)
	MOVL R11, 0x18(DI)
	MOVL R12, 0x20(DI)
	MOVL R13, 0x28(DI)
	MOVL R14, 0x30(DI)
	MOVL R15, 0x38(DI)

	ADDQ $0x40, SI
	ADDQ $0x40, DI
	JMP  loop

tail:
	// Undo the speculative subtraction; CX is now the 0..7 keys left over.
	ADDQ $8, CX
	JE   done

tailLoop:
	// Same per-key recipe as above, one key at a time.
	MOVQ   $-1, R8
	MOVQ   (SI), R9
	MOVQ   R9, (DI)
	CRC32Q (SI), R8
	MOVL   R8, (DI)

	ADDQ $0x08, SI
	ADDQ $0x08, DI
	DECQ CX
	JNZ  tailLoop

done:
	RET

////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////

// Pi<> is a file-local, read-only, pointer-free table of 0x80 (128) bytes,
// laid out as sixteen 64-bit words. Read most-significant digit first, the
// words spell out the leading hexadecimal digits of pi (3.243F6A88 85A308D3
// 13198A2E ...), sixteen digits per word. genCryptedPi in this file loads the
// table as eight 16-byte blocks.
DATA Pi<>+0x00(SB)/8, $0x3243f6a8885a308d
DATA Pi<>+0x08(SB)/8, $0x313198a2e0370734
DATA Pi<>+0x10(SB)/8, $0x4a4093822299f31d
DATA Pi<>+0x18(SB)/8, $0x0082efa98ec4e6c8
DATA Pi<>+0x20(SB)/8, $0x9452821e638d0137
DATA Pi<>+0x28(SB)/8, $0x7be5466cf34e90c6
DATA Pi<>+0x30(SB)/8, $0xcc0ac29b7c97c50d
DATA Pi<>+0x38(SB)/8, $0xd3f84d5b5b547091
DATA Pi<>+0x40(SB)/8, $0x79216d5d98979fb1
DATA Pi<>+0x48(SB)/8, $0xbd1310ba698dfb5a
DATA Pi<>+0x50(SB)/8, $0xc2ffd72dbd01adfb
DATA Pi<>+0x58(SB)/8, $0x7b8e1afed6a267e9
DATA Pi<>+0x60(SB)/8, $0x6ba7c9045f12c7f9
DATA Pi<>+0x68(SB)/8, $0x924a19947b3916cf
DATA Pi<>+0x70(SB)/8, $0x70801f2e2858efc1
DATA Pi<>+0x78(SB)/8, $0x6636920d871574e6
GLOBL Pi<>(SB), (NOPTR+RODATA), $0x80

// CryptedPi<> is a file-local, read-only, pointer-free table of 0x80 (128)
// bytes. The aes*BatchGenHashStates routines in this file load it as eight
// 16-byte blocks and use those blocks as the initial AES states for hashing.
// NOTE(review): presumably these are the bytes produced by genCryptedPi from
// Pi<>; that cannot be confirmed from this file alone.
DATA CryptedPi<>+0x00(SB)/8, $0x822233b93c11087c
DATA CryptedPi<>+0x08(SB)/8, $0xd2b32f4adde873da
DATA CryptedPi<>+0x10(SB)/8, $0xae9c2fc7dd17bcdb
DATA CryptedPi<>+0x18(SB)/8, $0x859110441a1569fc
DATA CryptedPi<>+0x20(SB)/8, $0x47087d794fffb5c9
DATA CryptedPi<>+0x28(SB)/8, $0xb7b6c8f565414445
DATA CryptedPi<>+0x30(SB)/8, $0xfd260edabb308f8d
DATA CryptedPi<>+0x38(SB)/8, $0x3ddefc67bc565a13
DATA CryptedPi<>+0x40(SB)/8, $0xe4c1d50223544f10
DATA CryptedPi<>+0x48(SB)/8, $0xaf40e05725c3192b
DATA CryptedPi<>+0x50(SB)/8, $0x281d8ab9a16382e9
DATA CryptedPi<>+0x58(SB)/8, $0xddc10c903b63a6cf
DATA CryptedPi<>+0x60(SB)/8, $0x852d3ad603e8df72
DATA CryptedPi<>+0x68(SB)/8, $0xa6642b57d1011deb
DATA CryptedPi<>+0x70(SB)/8, $0x5063d25a1cb7b6b9
DATA CryptedPi<>+0x78(SB)/8, $0xb2623e6241e8e46e
GLOBL CryptedPi<>(SB), (NOPTR+RODATA), $0x80

// func aesBytesBatchGenHashStates(data *[]byte, states *[3]uint64, length int)
// Requires: AES, AVX (VEX-encoded VAESENC)
//
// Hashes length byte slices. data is walked as an array of 24-byte slice
// headers (pointer at +0, length at +8); states is walked as an array of
// 24-byte outputs. For every slice:
//   - states[i][0]   = 64-bit digest of lane set A (X8-X11), XORed with the
//                      slice length;
//   - states[i][1:3] = 128-bit digest of lane set B (X12-X15).
// Both lane sets start from CryptedPi<> (A from blocks 0-3, B from blocks
// 4-7) and absorb the same 16-byte input blocks; each absorb is one AESENC
// round with the lane as the state and the input block as the round key.
//
// A non-positive length returns without touching memory.
//
// NOTE(review): the last block is always loaded from (end of slice - 16), so
// for a slice shorter than 16 bytes that load covers bytes located before the
// slice's first byte. Callers presumably pad keys to at least 16 bytes —
// confirm against the Go call sites.
//
// Registers: SI = slice-header cursor, DI = output cursor, CX = slices left,
// AX = read cursor inside the current slice, DX = moving bound derived from
// the slice end, BX = slice length, X0-X7 = pristine seeds.
TEXT ·aesBytesBatchGenHashStates(SB), NOSPLIT, $0-24
	MOVQ data+0(FP), SI
	MOVQ states+8(FP), DI
	MOVQ length+16(FP), CX

	// The per-slice loop below is bottom-tested, so an empty batch must be
	// rejected up front or the counter would wrap.
	TESTQ CX, CX
	JLE   exit

	VMOVDQU CryptedPi<>+0x00(SB), X0
	VMOVDQU CryptedPi<>+0x10(SB), X1
	VMOVDQU CryptedPi<>+0x20(SB), X2
	VMOVDQU CryptedPi<>+0x30(SB), X3
	VMOVDQU CryptedPi<>+0x40(SB), X4
	VMOVDQU CryptedPi<>+0x50(SB), X5
	VMOVDQU CryptedPi<>+0x60(SB), X6
	VMOVDQU CryptedPi<>+0x70(SB), X7

loop:
	MOVQ (SI), AX  // AX = slice base pointer
	MOVQ 8(SI), DX // DX = slice length
	MOVQ DX, BX    // keep the length for the final XOR

	// DX = end - 64: the bulk loop runs while more than 64 bytes remain.
	ADDQ AX, DX
	SUBQ $0x40, DX

	// Fresh working lanes for this slice.
	VMOVDQU X0, X8
	VMOVDQU X1, X9
	VMOVDQU X2, X10
	VMOVDQU X3, X11
	VMOVDQU X4, X12
	VMOVDQU X5, X13
	VMOVDQU X6, X14
	VMOVDQU X7, X15

innerLoop:
	CMPQ AX, DX
	JGE  tail

	// Absorb 64 bytes: block k goes into lane k of both lane sets.
	VAESENC 0x00(AX), X8, X8
	VAESENC 0x00(AX), X12, X12
	VAESENC 0x10(AX), X9, X9
	VAESENC 0x10(AX), X13, X13
	VAESENC 0x20(AX), X10, X10
	VAESENC 0x20(AX), X14, X14
	VAESENC 0x30(AX), X11, X11
	VAESENC 0x30(AX), X15, X15

	ADDQ $0x40, AX
	JMP  innerLoop

tail:
	// DX = end - 16. Absorb up to three more whole blocks while more than
	// 16 bytes remain past AX.
	ADDQ $0x30, DX
	CMPQ AX, DX
	JGE  done

	VAESENC (AX), X8, X8
	VAESENC (AX), X12, X12

	ADDQ $0x10, AX
	CMPQ AX, DX
	JGE  done

	VAESENC (AX), X9, X9
	VAESENC (AX), X13, X13

	ADDQ $0x10, AX
	CMPQ AX, DX
	JGE  done

	VAESENC (AX), X10, X10
	VAESENC (AX), X14, X14

done:
	// The final 16 bytes of the slice (which may overlap bytes that were
	// already absorbed) always go into the last lane of each set.
	VAESENC (DX), X11, X11
	VAESENC (DX), X15, X15

	// Collapse lane set A into X11, then apply three self-keyed rounds.
	VAESENC X9, X8, X8
	VAESENC X10, X11, X11
	VAESENC X8, X11, X11

	VAESENC X11, X11, X11
	VAESENC X11, X11, X11
	VAESENC X11, X11, X11

	// Collapse lane set B into X12.
	VAESENC X14, X13, X13
	VAESENC X15, X12, X12
	VAESENC X13, X12, X12

	// Fold X11 to 64 bits (low qword XOR high qword), then mix in the length.
	VPSHUFD $0x4e, X11, X8
	VPXOR   X8, X11, X11
	VMOVQ   X11, R8
	XORQ    BX, R8

	MOVQ    R8, (DI)
	VMOVDQU X12, 8(DI)

	// Advance one slice header and one 24-byte output record.
	ADDQ $24, SI
	ADDQ $24, DI
	DECQ CX
	JNZ  loop

exit:
	RET

// func aesInt192BatchGenHashStates(data *[3]uint64, states *[3]uint64, length int)
// Requires: AES, AVX (VEX-encoded VAESENC)
//
// Hashes length fixed-size 24-byte keys. For every key a 24-byte record is
// written to states: an 8-byte digest at offset 0 followed by a 16-byte
// digest at offset 8. Each key is read as two overlapping 16-byte blocks
// (bytes 0-15 and bytes 8-23); each block is used as the AESENC round key
// against a seed taken from CryptedPi<>.
//
// A non-positive length returns without touching memory.
//
// Registers: SI = key cursor, DI = output cursor, CX = keys remaining,
// X0-X7 = seeds (X3 and X6 are pre-mixed once, outside the loop).
TEXT ·aesInt192BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVQ data+0(FP), SI
	MOVQ states+8(FP), DI
	MOVQ length+16(FP), CX

	// The loop is bottom-tested; reject an empty batch so CX cannot wrap.
	TESTQ CX, CX
	JLE   done

	VMOVDQU CryptedPi<>+0x00(SB), X0
	VMOVDQU CryptedPi<>+0x10(SB), X1
	VMOVDQU CryptedPi<>+0x20(SB), X2
	VMOVDQU CryptedPi<>+0x30(SB), X3
	VMOVDQU CryptedPi<>+0x40(SB), X4
	VMOVDQU CryptedPi<>+0x50(SB), X5
	VMOVDQU CryptedPi<>+0x60(SB), X6
	VMOVDQU CryptedPi<>+0x70(SB), X7

	// Key-independent mixing of the unused seeds, hoisted out of the loop.
	VAESENC X2, X3, X3
	VAESENC X7, X6, X6

loop:
	// Absorb the two overlapping blocks into both digests.
	VAESENC 0x00(SI), X0, X8
	VAESENC 0x00(SI), X4, X10
	VAESENC 0x08(SI), X1, X9
	VAESENC 0x08(SI), X5, X11

	// 64-bit digest: combine, mix with the constant, two self-keyed rounds,
	// then fold the high qword into the low qword.
	VAESENC X8, X9, X9
	VAESENC X3, X9, X9
	VAESENC X9, X9, X9
	VAESENC X9, X9, X9
	VPSHUFD $0x4e, X9, X8
	VPXOR   X8, X9, X9

	// 128-bit digest: combine, then mix with the constant.
	VAESENC X11, X10, X10
	VAESENC X6, X10, X10

	VMOVQ   X9, 0x00(DI)
	VMOVDQU X10, 0x08(DI)

	ADDQ $0x18, SI
	ADDQ $0x18, DI
	DECQ CX
	JNZ  loop

done:
	RET

// func aesInt256BatchGenHashStates(data *[4]uint64, states *[3]uint64, length int)
// Requires: AES, AVX (VEX-encoded VAESENC)
//
// Hashes length fixed-size 32-byte keys. For every key a 24-byte record is
// written to states: an 8-byte digest at offset 0 followed by a 16-byte
// digest at offset 8. Each key is read as two 16-byte blocks (bytes 0-15 and
// bytes 16-31); each block is used as the AESENC round key against a seed
// taken from CryptedPi<>.
//
// A non-positive length returns without touching memory.
//
// Registers: SI = key cursor, DI = output cursor, CX = keys remaining,
// X0-X7 = seeds (X3 and X6 are pre-mixed once, outside the loop).
TEXT ·aesInt256BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVQ data+0(FP), SI
	MOVQ states+8(FP), DI
	MOVQ length+16(FP), CX

	// The loop is bottom-tested; reject an empty batch so CX cannot wrap.
	TESTQ CX, CX
	JLE   done

	VMOVDQU CryptedPi<>+0x00(SB), X0
	VMOVDQU CryptedPi<>+0x10(SB), X1
	VMOVDQU CryptedPi<>+0x20(SB), X2
	VMOVDQU CryptedPi<>+0x30(SB), X3
	VMOVDQU CryptedPi<>+0x40(SB), X4
	VMOVDQU CryptedPi<>+0x50(SB), X5
	VMOVDQU CryptedPi<>+0x60(SB), X6
	VMOVDQU CryptedPi<>+0x70(SB), X7

	// Key-independent mixing of the unused seeds, hoisted out of the loop.
	VAESENC X2, X3, X3
	VAESENC X7, X6, X6

loop:
	// Absorb the two blocks into both digests.
	VAESENC 0x00(SI), X0, X8
	VAESENC 0x00(SI), X4, X10
	VAESENC 0x10(SI), X1, X9
	VAESENC 0x10(SI), X5, X11

	// 64-bit digest: combine, mix with the constant, two self-keyed rounds,
	// then fold the high qword into the low qword.
	VAESENC X8, X9, X9
	VAESENC X3, X9, X9
	VAESENC X9, X9, X9
	VAESENC X9, X9, X9
	VPSHUFD $0x4e, X9, X8
	VPXOR   X8, X9, X9

	// 128-bit digest: combine, then mix with the constant.
	VAESENC X11, X10, X10
	VAESENC X6, X10, X10

	VMOVQ   X9, 0x00(DI)
	VMOVDQU X10, 0x08(DI)

	// Keys are 32 bytes apart, output records 24 bytes apart.
	ADDQ $0x20, SI
	ADDQ $0x18, DI
	DECQ CX
	JNZ  loop

done:
	RET

// func aesInt320BatchGenHashStates(data *[5]uint64, states *[3]uint64, length int)
// Requires: AES, AVX (VEX-encoded VAESENC)
//
// Hashes length fixed-size 40-byte keys. For every key a 24-byte record is
// written to states: an 8-byte digest at offset 0 followed by a 16-byte
// digest at offset 8. Each key is read as three 16-byte blocks (bytes 0-15,
// bytes 16-31 and the overlapping bytes 24-39); each block is used as the
// AESENC round key against a seed taken from CryptedPi<>.
//
// A non-positive length returns without touching memory.
//
// Registers: SI = key cursor, DI = output cursor, CX = keys remaining,
// X0-X7 = seeds.
TEXT ·aesInt320BatchGenHashStates(SB), NOSPLIT, $0-24
	MOVQ data+0(FP), SI
	MOVQ states+8(FP), DI
	MOVQ length+16(FP), CX

	// The loop is bottom-tested; reject an empty batch so CX cannot wrap.
	TESTQ CX, CX
	JLE   done

	VMOVDQU CryptedPi<>+0x00(SB), X0
	VMOVDQU CryptedPi<>+0x10(SB), X1
	VMOVDQU CryptedPi<>+0x20(SB), X2
	VMOVDQU CryptedPi<>+0x30(SB), X3
	VMOVDQU CryptedPi<>+0x40(SB), X4
	VMOVDQU CryptedPi<>+0x50(SB), X5
	VMOVDQU CryptedPi<>+0x60(SB), X6
	VMOVDQU CryptedPi<>+0x70(SB), X7

loop:
	// Absorb the three blocks into both digests.
	VAESENC 0x00(SI), X0, X8
	VAESENC 0x00(SI), X4, X11
	VAESENC 0x10(SI), X1, X9
	VAESENC 0x10(SI), X5, X12
	VAESENC 0x18(SI), X3, X10
	VAESENC 0x18(SI), X6, X13

	// 64-bit digest: combine the three lanes plus the X2 constant, run two
	// self-keyed rounds, then fold the high qword into the low qword.
	VAESENC X10, X8, X8
	VAESENC X2, X9, X9
	VAESENC X9, X8, X8
	VAESENC X8, X8, X8
	VAESENC X8, X8, X8
	VPSHUFD $0x4e, X8, X9
	VPXOR   X9, X8, X8

	// 128-bit digest: combine the three lanes plus the X7 constant.
	VAESENC X12, X11, X11
	VAESENC X7, X13, X13
	VAESENC X13, X11, X11

	VMOVQ   X8, 0x00(DI)
	VMOVDQU X11, 0x08(DI)

	// Keys are 40 bytes apart, output records 24 bytes apart.
	ADDQ $0x28, SI
	ADDQ $0x18, DI
	DECQ CX
	JNZ  loop

done:
	RET

// genCryptedPi loads the eight 16-byte blocks of Pi<>, scrambles each block
// independently, and stores the 128 bytes of result at the address passed in
// dst+0(FP).
//
// Every block goes through two identical passes. One pass is thirteen AESENC
// rounds followed by a single AESENCLAST round, and in every round the block
// serves as both the state and the round key.
//
// NOTE(review): the symbol is declared without the "·" package prefix used by
// the other routines in this file, so a Go prototype in the package would
// presumably not resolve to it — confirm whether that is intentional.

// One self-keyed AESENC round applied to all eight blocks.
#define PI_ROUND \
	VAESENC X0, X0, X0; \
	VAESENC X1, X1, X1; \
	VAESENC X2, X2, X2; \
	VAESENC X3, X3, X3; \
	VAESENC X4, X4, X4; \
	VAESENC X5, X5, X5; \
	VAESENC X6, X6, X6; \
	VAESENC X7, X7, X7

// One self-keyed AESENCLAST round applied to all eight blocks.
#define PI_FINAL_ROUND \
	VAESENCLAST X0, X0, X0; \
	VAESENCLAST X1, X1, X1; \
	VAESENCLAST X2, X2, X2; \
	VAESENCLAST X3, X3, X3; \
	VAESENCLAST X4, X4, X4; \
	VAESENCLAST X5, X5, X5; \
	VAESENCLAST X6, X6, X6; \
	VAESENCLAST X7, X7, X7

TEXT genCryptedPi(SB), NOSPLIT, $0-8
	MOVQ dst+0(FP), AX

	VMOVDQU Pi<>+0x00(SB), X0
	VMOVDQU Pi<>+0x10(SB), X1
	VMOVDQU Pi<>+0x20(SB), X2
	VMOVDQU Pi<>+0x30(SB), X3
	VMOVDQU Pi<>+0x40(SB), X4
	VMOVDQU Pi<>+0x50(SB), X5
	VMOVDQU Pi<>+0x60(SB), X6
	VMOVDQU Pi<>+0x70(SB), X7

	// First pass: 13 rounds + final round.
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_FINAL_ROUND

	// Second pass: 13 rounds + final round.
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_ROUND
	PI_FINAL_ROUND

	VMOVDQU X0, 0x00(AX)
	VMOVDQU X1, 0x10(AX)
	VMOVDQU X2, 0x20(AX)
	VMOVDQU X3, 0x30(AX)
	VMOVDQU X4, 0x40(AX)
	VMOVDQU X5, 0x50(AX)
	VMOVDQU X6, 0x60(AX)
	VMOVDQU X7, 0x70(AX)

	RET
